#!/usr/bin/env python
# coding=utf-8
# __author__ = 'Yunchao Ling'

import sys
import re
import random


def getContent(line, element):
    """Return the text enclosed by ``<element>...</element>`` in *line*.

    The closing tag must be the last thing on the line; anything may precede
    the opening tag. Because the leading ``.*`` is greedy, when the opening
    tag occurs several times the text after the last usable one is returned.

    Args:
        line: A single line of text (no embedded newlines expected).
        element: Tag name to look for, without angle brackets. It is matched
            literally, so regex metacharacters in the name are safe.

    Returns:
        The enclosed text, or "" when the line does not match. An empty
        element (``<x></x>``) also yields "".
    """
    # Escape the tag name so characters such as "." or "(" are not
    # interpreted as regex syntax.
    tag = re.escape(element)
    match = re.match("^.*<" + tag + ">(.*)</" + tag + ">$", line)
    if match:
        return match.group(1)
    return ""


def getSample(filepath, element, number_perfile):
    """Return a random sample of *element* values found in a file.

    Every line containing *element* is passed to ``getContent``; the
    non-empty results are collected and up to *number_perfile* of them are
    drawn at random without replacement.

    Args:
        filepath: Path of the text/XML file to scan line by line.
        element: Tag name whose content should be extracted.
        number_perfile: Maximum number of values to return.

    Returns:
        A list of at most *number_perfile* extracted strings in random
        order. Empty when nothing matched or *number_perfile* is not
        positive.

    Raises:
        IOError/OSError: If *filepath* cannot be opened.
    """
    all_content = []
    # "with" guarantees the handle is closed even if reading raises.
    with open(filepath, "r") as infile:
        for line in infile:
            line = line.rstrip()
            # Cheap substring test so the regex only runs on candidate lines.
            if element in line:
                content = getContent(line, element)
                if content:
                    all_content.append(content)

    # Never ask for more items than exist; random.sample rejects that, and
    # it also rejects negative sizes, hence the explicit guard.
    sample_size = min(number_perfile, len(all_content))
    if sample_size <= 0:
        return []
    return random.sample(all_content, sample_size)


if __name__ == "__main__":
    # Usage: <script> ELEMENT TOTALNUMBER
    #
    # Repeatedly picks a random file named pubmed_<start>_<end>.xml in the
    # current directory, samples values of ELEMENT from it, and prints
    # TOTALNUMBER distinct values, one per line.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <element> <totalnumber>\n" % sys.argv[0])
        sys.exit(1)

    element = sys.argv[1]
    totalnumber = int(sys.argv[2])

    # File naming scheme: file k covers ids k*10000+1 .. (k+1)*10000, for
    # k in 0..2541 (presumably the size of the local PubMed dump -- confirm).
    ids_per_file = 10000
    max_file_index = 2541
    samples_per_file = 100

    # A set keeps the collected values distinct.
    final_list = set()

    # NOTE: like the original, this never terminates if fewer than
    # totalnumber distinct values can be found, and a missing file raises.
    while len(final_list) < totalnumber:
        randomnumber = random.randint(0, max_file_index)
        start = randomnumber * ids_per_file + 1
        end = (randomnumber + 1) * ids_per_file
        filepath = "pubmed_%d_%d.xml" % (start, end)
        for content in getSample(filepath, element, samples_per_file):
            final_list.add(content)
            # Stop as soon as the requested amount is reached so the output
            # never exceeds totalnumber.
            if len(final_list) >= totalnumber:
                break

    for item in final_list:
        # The parenthesised form works on both Python 2 and Python 3.
        print(item)







